In [ ]:
# Environment check: `print_function` import keeps the cell Python-2 compatible;
# the sklearn version matters because later cells use version-sensitive APIs.
from __future__ import print_function
import numpy as np
from sklearn import __version__ as sklearn_version
print('Sklearn version:', sklearn_version)
The 20 newsgroups dataset comprises around 18000 newsgroups posts on 20 topics split into two subsets: one for training (or development) and the other one for testing (or for performance evaluation). The split between the train and test set is based upon messages posted before and after a specific date.
In [ ]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four topical categories to keep the demo small.
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']
# Strip headers, footers and quoted replies so the model learns from the
# message body only (avoids leaking metadata such as sender addresses).
twenty_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)
twenty_train.target_names
In [ ]:
# Inspect one raw training document together with its numeric class label.
sample_text = twenty_train.data[0]
sample_target = twenty_train.target[0]
print(sample_text)
print('---------------')
print('Target: ', sample_target)
In [ ]:
# Text preprocessing, tokenizing and filtering of stopwords
from sklearn.feature_extraction.text import CountVectorizer
# Define the count vectorizer: tokenizes, lowercases and drops English stop words.
count_vect = CountVectorizer(stop_words='english')
# Fit the vocabulary on the train texts and transform them into a sparse
# document-term matrix of token counts (n_documents x n_vocabulary_terms).
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
In [ ]:
# Visualize the results: sparse first row (one document's term counts)
# and sparse first column (one term's count across all documents).
row0 = X_train_counts[0, :]
col0 = X_train_counts[:, 0]
print(row0)
print(col0)
In [ ]:
# From occurrences to frequencies: TF-IDF down-weights terms that appear
# in many documents and normalizes for document length.
from sklearn.feature_extraction.text import TfidfTransformer
# Define the TF-IDF transformer and fit it (learns per-term IDF weights).
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
# Apply the transformer to the train matrix of document terms.
X_train_tf = tfidf_transformer.transform(X_train_counts)
X_train_tf.shape
In [ ]:
# Visualize the results: same row/column as before, now TF-IDF weighted.
tf_row0 = X_train_tf[0, :]
tf_col0 = X_train_tf[:, 0]
print(tf_row0)
print(tf_col0)
In [ ]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes is a standard baseline for term-frequency features.
clf = MultinomialNB()
clf = clf.fit(X_train_tf, twenty_train.target)
In [ ]:
# Score test data
# Read test data (same categories and cleaning as the train split)
twenty_test = fetch_20newsgroups(subset='test',
                                 remove=('headers', 'footers', 'quotes'),
                                 categories=categories, shuffle=True, random_state=42)
# Transform text to counts with the vectorizer already fitted on train data
# (transform only -- never refit on the test set, or the vocabularies diverge)
X_test_counts = count_vect.transform(twenty_test.data)
# tf-idf transformation, again reusing the transformer fitted on train data
X_test_tf = tfidf_transformer.transform(X_test_counts)
# Prediction
predicted = clf.predict(X_test_tf)
# Accuracy
from sklearn.metrics import accuracy_score
print('Accuracy test: ', accuracy_score(twenty_test.target, predicted))
In [ ]:
In [ ]:
# Define the pipeline: chain vectorizer -> TF-IDF -> Naive Bayes so the
# whole counts/tf-idf/classify sequence from the previous cells runs in one object.
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])
# Fit all the pipeline steps on the raw text in a single call
text_clf.fit(twenty_train.data, twenty_train.target)
In [ ]:
# Evaluate on test data: the pipeline vectorizes raw text internally,
# so predict() takes documents directly.
twenty_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    shuffle=True,
    random_state=42,
)
predicted = text_clf.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)
In [ ]:
from sklearn.linear_model import SGDClassifier
# Linear SVM trained with SGD (hinge loss + L2 penalty).
# NOTE: the `n_iter` parameter was deprecated in sklearn 0.19 and removed in
# 0.21; `max_iter=5, tol=None` reproduces the old fixed 5-epoch behavior.
text_clf = Pipeline([('vect', CountVectorizer(max_df=0.95, min_df=2, max_features=5000, stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, max_iter=5, tol=None,
                                           random_state=42)),
                     ])
# Fit
_ = text_clf.fit(twenty_train.data, twenty_train.target)
# Predict
predicted = text_clf.predict(twenty_test.data)
# Evaluate accuracy
np.mean(predicted == twenty_test.target)
In [ ]:
In [ ]:
from sklearn import svm

# Same preprocessing as before, but with a LinearSVC classifier instead of SGD.
bow = CountVectorizer(max_df=0.95, min_df=2,
                      max_features=5000, stop_words='english')
text_clf_svm = Pipeline([
    ('vect', bow),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.LinearSVC()),
])
_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)
In [ ]:
In [ ]:
from sklearn.model_selection import RandomizedSearchCV
# Define estimator. No parameters of the search here: the grid below sets them.
clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', svm.LinearSVC()),
                ])
# Specify parameters and distributions to sample from.
# Parameters of pipelines can be set using '__' separated parameter names:
# '<step name>__<parameter name>'.
param_dist = {"vect__max_features": [1000, 2500, 5000, 10000, None],
              "vect__stop_words": ['english', None],
              "clf__C": [0.1, 0.5, 1.0, 1.5, 2.0]}
# Define randomized search: samples n_iter random parameter combinations,
# scoring each with cross-validation.
n_iter_search = 10
random_search = RandomizedSearchCV(clf,
                                   param_distributions=param_dist,
                                   n_iter=n_iter_search,
                                   random_state=42)
# Run the randomized search
random_search.fit(twenty_train.data, twenty_train.target)
print("Done!")
In [ ]:
# Tabulate the search results: one row per sampled parameter combination,
# with mean/std CV scores and timings.
import pandas as pd
df_cv_results = pd.DataFrame(random_search.cv_results_)
df_cv_results
In [ ]:
# Refit a pipeline with the best parameters found by the search,
# then score it on the held-out test data.
best_vect = CountVectorizer(max_df=0.95, min_df=2,
                            max_features=10000, stop_words='english')
text_clf_svm = Pipeline([
    ('vect', best_vect),
    ('tfidf', TfidfTransformer()),
    ('clf', svm.LinearSVC(C=1.5)),
])
_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted = text_clf_svm.predict(twenty_test.data)
np.mean(predicted == twenty_test.target)
In [ ]:
In [ ]:
from sklearn import metrics
# Per-class precision, recall and F1 on the test split.
report = metrics.classification_report(twenty_test.target,
                                       predicted,
                                       target_names=twenty_test.target_names)
print(report)
In [ ]:
metrics.confusion_matrix(twenty_test.target, predicted)